This package downloads pollution data for the Mexico City metro area. It can download real-time, daily maximum, minimum, or hourly average data for each of the pollution (and wind) measuring stations or geographical zones in the Zona Metropolitana del Valle de México.
For the moment this package is only available from github. For the development version:
if (!require(devtools)) {
install.packages("devtools")
}
devtools::install_github('diegovalle/aire.zmvm')
The package consists mainly of three functions:
get_station_data to download data for each of the pollution (and wind and temperature) measuring stations in the original measuring units.get_zone_data to download data for each of the 5 geographic zones of Mexico City as measured in IMECAsget_latest_data to download the latest values for each of the pollution measuring stations.library("aire.zmvm")
library("dplyr")
library("ggplot2")
library("mgcv")
library("lubridate")
library("stringr")
o3 <- get_station_data(criterion = "MAXIMOS", # Can be one of MAXIMOS (daily maximum),
# MINIMOS (daily minimum),
# or HORARIOS (hourly average)
pollutant = "O3", # Can be one of "SO2", "CO", "NOX", "NO2", "NO", "O3",
# "PM10", "PM25", "WSP", "WDR", "TMP", "RH"
year = 2005:2016) # A numeric vector, the earliest year allowed is 1986
# Daily max among all base stations
o3_max <- o3 %>%
group_by(date) %>%
summarise(max = ifelse(all(is.na(value)),
NA,
base::max(value, na.rm = TRUE))) %>%
na.omit()
# Plot the daily highest pm10 level with trendline
ggplot(o3_max,
aes(date, max, group = 1)) +
geom_point(color = "black", size = .2, alpha = .4) +
geom_smooth(method = "gam", formula = y ~ s(x, k = 25)) +
ggtitle("Daily maximum O3 levels") +
ylab("maximum daily O3 value in ppb") +
xlab("date") +
geom_vline(xintercept = as.numeric(as.Date("2015-07-01"))) +
annotate("text", label = "supreme court ruling",
x = as.Date("2014-01-20"),
y = 350) +
theme_bw()
The function get_station_data can be used to download pollution and wind data going back to 1986, altough not all pollutants are available that far back, and requesting a pollutant that is not available will result in a warning.
download_data <- function(pollutant) {
print(paste0("Downloading data for: ", pollutant))
suppressWarnings({
o3 <- get_station_data(criterion = "MAXIMOS", # Can be MAXIMOS (daily maximum),
# MINIMOS (daily minimum),
# or HORARIOS (hourly average)
pollutant = pollutant, # "SO2", "CO", "NOX", "NO2", "NO", "O3",
# "PM10", "PM25", "WSP", "WDR", "TMP", "RH"
year = 1986:2016) # The earliest year allowed is 1986
})
o3$pollutant <- pollutant
# Daily max among all base stations
o3 %>%
group_by(date, pollutant) %>%
summarise(max = ifelse(all(is.na(value)),
NA,
base::max(value, na.rm = TRUE))) %>%
na.omit()
}
ll <- mapply(download_data,
pollutant = c("NO", "SO2", "CO", "NOX", "NO2", "O3", "PM10", "PM25"),
SIMPLIFY = FALSE)
## [1] "Downloading data for: NO"
## [1] "Downloading data for: SO2"
## [1] "Downloading data for: CO"
## [1] "Downloading data for: NOX"
## [1] "Downloading data for: NO2"
## [1] "Downloading data for: O3"
## [1] "Downloading data for: PM10"
## [1] "Downloading data for: PM25"
df_maximos <- bind_rows(ll)
knitr::kable(head(df_maximos))
| date | pollutant | max |
|---|---|---|
| 2003-01-01 | NO | 201 |
| 2003-01-02 | NO | 228 |
| 2003-01-03 | NO | 229 |
| 2003-01-04 | NO | 301 |
| 2003-01-05 | NO | 230 |
| 2003-01-06 | NO | 275 |
We can plot all pollutants going back to 1986 and add a trend line based on a GAM controlling for month of year, with lines for the start of the hoy no circula and the supreme court decision that the hoy no circula doesn’t necesarlly apply to older cars. A more complex model is left as an execise to the reader.
fitted <- function(max, month, date, year) {
df=data.frame(max = max,
month = month,
date = date,
year = year)
fit <- gam(max ~ s(month, bs = "cc", k = 12) + s(as.numeric(date), k = 20),
data = df, correlation = corARMA(form = ~ 1|year, p = 1))
predict(fit, newdata = df, type = "terms")[,2] + mean(max)
}
df_maximos <- df_maximos %>%
mutate(month = month(date),
year = year(date))%>%
group_by(pollutant) %>%
mutate(pred =fitted(max,month,date,year))
# Plot the daily highest level with trendline
ggplot(df_maximos,
aes(date, max, group = 1)) +
geom_point(color = "black", size = .2, alpha = .05) +
geom_line(aes(date, pred), color ="blue", size = 1.2) +
ggtitle("Daily maximum pollutant levels") +
ylab("maximum daily value") +
xlab("date") +
geom_vline(xintercept = as.numeric(as.Date("1989-11-20"))) +
geom_vline(xintercept = as.numeric(as.Date("2015-07-01"))) +
theme_bw() +
facet_wrap(~pollutant, scales = "free_y")
For measuring pollution Mexico City was divided into four zones. The function get_zone_data downloads the data for each zones as measured in IMECAs as oppesed to the original units that get_station_data uses. Note that the statndards for measuring PM10 and O3 in IMECAS changed in October 2014 and the function prints a warning.
# Download pm10 data since 2008 for all available zones ("TZ")
pm_10 <- get_zone_data(criterion = "MAXIMOS", # Can be MAXIMOS (daily maximum) or
# HORARIOS (hourly average)
pollutant = "PM10", # "SO2", "CO", "NO2", "O3", "PM10",
# "TC" (All pollutants)
zone = "TZ", # "NO", "NE", "CE", "SO", "SE", "TZ" (All zones)
start_date = "2010-01-01", # Can't be earlier than 2008-01-01
end_date = "2016-03-19") # Can be up to the current date
## Warning in get_zone_data(criterion = "MAXIMOS", pollutant = "PM10", zone = "TZ", :
## *******************
## Starting October 28, 2014 the index values for O3 and PM10 are computed using NOM-020-SSA1-2014 and NOM-025-SSA1-2014
## *******************
knitr::kable(head(pm_10))
| date | zone | pollutant | unit | value |
|---|---|---|---|---|
| 2010-01-01 | NO | PM10 | IMECA | 89 |
| 2010-01-02 | NO | PM10 | IMECA | 57 |
| 2010-01-03 | NO | PM10 | IMECA | 44 |
| 2010-01-04 | NO | PM10 | IMECA | 23 |
| 2010-01-05 | NO | PM10 | IMECA | 38 |
| 2010-01-06 | NO | PM10 | IMECA | 56 |
Plotting the data makes the change in October 2014 really obvious
# Plot the overall highest maximum pm10 level with trendline
ggplot(pm_10 %>% group_by(date) %>% summarise(max = max(value, na.rm = TRUE)),
aes(date, max, group = 1)) +
geom_line(color = "darkgray", size = .2) +
geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se = FALSE) +
ggtitle("Daily maximum PM10 levels in IMECAS") +
theme_bw()
Here’s an example plotting the maximum pollution value for Ozone and PM10 by hour of day.
download_horarios <- function(pollutant) {
print(paste0("Downloading data for: ", pollutant))
o3 <- get_station_data(criterion = "HORARIOS", # Can be MAXIMOS (daily maximum),
# MINIMOS (daily minimum),
# or HORARIOS (hourly average)
pollutant = pollutant, # "SO2", "CO", "NOX", "NO2", "NO", "O3",
# "PM10", "PM25", "WSP", "WDR", "TMP", "RH"
year = 2001:2016) # The earliest year allowed is 1986
o3$pollutant <- pollutant
# Daily max among all base stations
o3 %>%
group_by(date, hour, pollutant) %>%
summarise(max = ifelse(all(is.na(value)),
NA,
base::max(value, na.rm = TRUE))) %>%
na.omit()
}
ll_horarios <- mapply(download_horarios,
pollutant = c("O3", "PM10"),
SIMPLIFY = FALSE)
## [1] "Downloading data for: O3"
## [1] "Downloading data for: PM10"
df_horarios <-bind_rows(ll_horarios)
The data returned by get_station_data when the “HORARIOS” criterion is used includes the date and hour of each measurement. The hour is specified as an offset from midnight and does not include daylight savings time (basically GMT+6).
knitr::kable(head(df_horarios))
| date | hour | pollutant | max |
|---|---|---|---|
| 2001-01-01 | 1 | O3 | 13 |
| 2001-01-01 | 2 | O3 | 22 |
| 2001-01-01 | 3 | O3 | 30 |
| 2001-01-01 | 4 | O3 | 29 |
| 2001-01-01 | 5 | O3 | 29 |
| 2001-01-01 | 6 | O3 | 26 |
# The time is given in hours with no DST
# GMT has no DST
df_horarios$datetime <- strptime(str_c(df_horarios$date, " ", df_horarios$hour),
"%Y-%m-%d %H", tz = "GMT+6") %>% as.POSIXct()
# Convert to MXC time
df_horarios$datetime <- as.POSIXct(format(df_horarios$datetime,
tz="America/Mexico_City",
usetz = TRUE))
df_horarios$hour_dst <- hour(df_horarios$datetime)
Ozone peaks after midday and PM10 particles peak with communiting hours. But for PM10 a phase I contingency is declared when the 24 hour average exceeds 150 IMECAS and for O3 when the hourly average exceeds 150 IMECAS (the data is shown in the original units of the data, not IMECAS)
df_horarios$hour_dst <- factor(df_horarios$hour_dst,
levels = c(5:23, 0:4),
ordered = TRUE)
ggplot(df_horarios,
aes(hour_dst, max, group = date, color = pollutant)) +
geom_line(alpha = I(1/sqrt((2016-2001)*365))) +
facet_wrap(~pollutant) +
coord_cartesian(ylim = c(0, 380)) +
guides(color = guide_legend("pollutant",
override.aes = list(alpha = 1))) +
theme_bw() +
xlab("hour of day") +
ylab("maximum pollution value among all stations") +
scale_x_discrete(breaks = seq(0, 23, 5)) +
ggtitle("Peak hours for O3 and PM10 pollutants")
We can use the function get_latest_data to download real-time data of the pollutant with the highest value.
library("ggmap")
library("viridis")
library("gstat")
library("sp")
mxc <- get_latest_data()
The package includes a data.frame with the locations of all stations
data("stations")
knitr::kable(head(stations))
| station_code | station_name | lon | lat | altitude | comment | station_id |
|---|---|---|---|---|---|---|
| ACO | Acolman | -98.91200 | 19.63550 | 2198 | 484150020109 | |
| AJM | Ajusco Medio | -99.20773 | 19.27185 | NA | NA | |
| INN | Investigaciones Nucleares | -99.39044 | 19.29373 | NA | NA | |
| MGH | Miguel Hidalgo | -99.20199 | 19.41071 | NA | NA | |
| GAM | Gustavo A. Madero | -99.09472 | 19.48249 | NA | NA | |
| AJU | Ajusco | -99.16261 | 19.15429 | 2942 | 484090120400 |
qmplot(lon, lat, data = stations, maptype = "toner-background")
but only a subset of stations report data
mxc <- na.omit(left_join(mxc, stations, by = "station_code"))
qmplot(lon, lat, data = mxc, maptype = "toner-background", zoom = 11) +
geom_point(aes(color = value), size = 8) +
scale_color_viridis()
We can improve on the dot plot of the data by creating agrid of Mexico City and using inverse distace weigthing to color each cell.
geog.o3 <- mxc[,c("lat", "lon", "value")]
coordinates(geog.o3) <- ~lon+lat
#spplot(geog.o3)
pixels = 50
geog.grd <- expand.grid(x=seq((min(coordinates(geog.o3)[,1])-.15),
(max(coordinates(geog.o3)[,1])+.15),
length.out=pixels),
y=seq((min(coordinates(geog.o3)[,2])-.15),
(max(coordinates(geog.o3)[,2])+.15),
length.out=pixels))
grd.pts <- SpatialPixels(SpatialPoints((geog.grd)))
grd.pts <- as(grd.pts, "SpatialGrid")
Here’s what the grid looks like:
plot(grd.pts, cex = 1.5, col = "grey")
points(geog.o3, pch = 1, col = "red", cex = 1)
geog.idw <- idw(value ~ 1, geog.o3, grd.pts, idp=6)
## [inverse distance weighted interpolation]
coloring the grid with the weigthed values:
spplot(geog.idw["var1.pred"])
idw = as.data.frame(geog.idw)
names(idw) <- c("var1.pred", "var1.var", "lon", "lat")
and finally plotting the grid on top of a Google Map
qmplot(x, y, data = geog.grd, geom = "blank",
maptype = "roadmap", source = "google") +
geom_tile(data = idw, aes(x = lon, y = lat, fill = var1.pred), alpha = .5) +
scale_fill_viridis("IMECAS", limits = c(0,200), option = "magma") +
geom_point(data = mxc, aes(x = lon, y = lat, color = quality), size = 2, alpha = .5) +
#scale_color_viridis("IMECAS", discrite = TRUE, limits = c(0,200), option = "magma") +
scale_color_manual(breaks = c("BUENA", "REGULAR", "MALA", "MUY MALA",
"EXTREMADAMENTE MALA"),
values = viridis(5, option = "plasma")) +
ggtitle("Pollution in MXC")
## Using zoom = 10...
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=19.438236,-99.088897&zoom=10&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false